
Coronavirus is a family of viruses that can cause illness, which can vary from the common cold and cough to sometimes more severe disease. Middle East Respiratory Syndrome (MERS-CoV) and Severe Acute Respiratory Syndrome (SARS-CoV) were such severe cases that the world has already faced.
SARS-CoV-2 (n-coronavirus) is the new virus of the coronavirus family, first discovered in 2019, which had not been identified in humans before. It is a contagious virus that started from Wuhan in December 2019. It was later declared a pandemic by the WHO due to its high rate of spread throughout the world. Currently (as of 23rd April 2020), this has led to a total of 189K+ deaths across the globe, including 110K+ deaths in Europe alone.
As the pandemic spreads all over the world, it becomes more important to understand this spread. This notebook is an effort to analyze the data of confirmed, death, and recovered cases over time. The main focus is to analyze the spread trend of this virus all over the world.
2020 Educational and Population Global Data Repository by UNESCO UIS Statistics
This dataset is updated on annual basis by UNESCO UIS Statistics
pip install pycountry
pip install empiricaldist
pip install plotly_express
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
import math
import pycountry
import pycountry_convert as pc
from plotly.subplots import make_subplots
import plotly_express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
from plotly.subplots import make_subplots
import empiricaldist as emp
%matplotlib inline
sns.set_style('darkgrid')
import warnings
warnings.filterwarnings('ignore')
Three time-series datasets:
def ecdf(data):
    #credits DataCamp Justin Bois
    """Compute the empirical CDF of a one-dimensional array of measurements.

    Returns a pair (x, y): x is the sorted measurements and y is the
    cumulative fraction i/n for each sorted value.
    """
    size = len(data)
    sorted_vals = np.sort(data)
    fractions = np.arange(1, size + 1) / size
    return sorted_vals, fractions
def country_pick(main_df, country_name, startdate):
    """Return the rows of `main_df` for one country on or after `startdate`."""
    subset = main_df[main_df['country'] == country_name]
    # Renumber rows from 0 and discard the old index column.
    subset = subset.reset_index().drop('index', axis=1)
    return subset[subset.Date >= startdate]
def pxplotline(main_df,sub_df,y,x ='Date',title='No Title',hd=['pop']):
    """Plots line plot using plotly_express from selected Dataframe"""
    # NOTE(review): mutable default `hd=['pop']` is shared across calls; it is
    # never mutated here, so this is safe, but `hd=None` would be cleaner.
    # Aggregate to one row per (country, Date); confirmed%/mortality% are
    # included as group keys so they survive the sum().
    df = main_df.groupby(['country','Date','confirmed%','mortality%'],as_index=False)['confirmed','death','recovered','active'].sum()
    # NOTE(review): tuple-style column selection after groupby (above) is
    # deprecated and removed in pandas >= 2.0; a list selection is the
    # forward-compatible spelling.
    # Inner join against sub_df restricts the plot to the countries the
    # caller selected (e.g. the top-10 frame).
    df = df.merge(sub_df,on='country')
    # The merge duplicates every shared column as *_x/*_y; drop the _y side
    # and strip the _x suffix so downstream names are clean.
    df.drop(['Date_y','confirmed_y','death_y','recovered_y','active_y','confirmed%_y','mortality%_y'],axis=1,inplace=True)
    df.rename(columns={'Date_x':'Date','confirmed_x':'confirmed','death_x':'death','recovered_x':'recovered','active_x':'active','confirmed%_x':'confirmed%','mortality%_x':'mortality%'},inplace=True)
    fig = px.line(df,x = x, y = y, color='country',title = title,hover_data=hd)
    fig.show()
def mplotbar_single_country(main_df, country_name, startdate, y, title='No Title'):
    """Draw a seaborn bar plot of column `y` over time for a single country."""
    country_df = country_pick(main_df, country_name, startdate)
    figure, axis = plt.subplots(figsize=(20, 10))
    sns.barplot(data=country_df, x='Date', y=y, ax=axis, color='#6495ED')
    # Full ISO-style date labels, angled so they do not overlap.
    tick_labels = country_df.Date.dt.strftime('%Y-%m-%d')
    axis.set_xticklabels(labels=tick_labels, rotation=45, ha='right')
    plt.title(title)
def mplotline_single_country(main_df, country_name, startdate, y, title='No Title'):
    """Draw a seaborn line plot of column `y` over time for a single country."""
    country_df = country_pick(main_df, country_name, startdate)
    figure, axis = plt.subplots(figsize=(20, 10))
    sns.lineplot(data=country_df, x='Date', y=y, marker='o', ax=axis)
    # One tick per observation, labelled month-day and angled for legibility.
    axis.set(xticks=country_df.Date.values)
    _ = axis.set_xticklabels(labels=country_df.Date.dt.strftime('%m-%d'), rotation=45)
    plt.title(title)
def mplotline_list_country(main_df, country_names, startdate, y, fig=(20, 10)):
    """Plots line plot using seaborn for a list of countries from selected Dataframe.

    Parameters
    ----------
    main_df : DataFrame with 'country' and 'Date' columns.
    country_names : iterable of country names, one line each.
    startdate : earliest date (inclusive) to plot.
    y : column name to plot on the y axis.
    fig : figure size tuple (kept as `fig` for backward compatibility).
    """
    # BUG FIX: the original rebound `fig` (the figsize parameter) first to the
    # Figure and then to an Axes, and raised NameError on `df` when
    # country_names was empty. Distinct names and a guard fix both.
    figure, ax = plt.subplots(figsize=fig)
    df = None
    for name in country_names:
        df = main_df[main_df['country'] == name]
        df = df[df.Date >= startdate]
        sns.lineplot(data=df, x='Date', y=y, marker='.', ax=ax, label=name)
    if df is not None:
        # Tick positions/labels come from the last country plotted,
        # matching the original behavior.
        ax.set(xticks=df.Date.values)
        _ = ax.set_xticklabels(labels=df.Date.dt.strftime('%m-%d'), rotation=45)
    plt.legend()
def add_daily(df):
    """Adds columns of daily count increases to the selected Dataframe.

    For each cumulative column (confirmed, death, recovered, active) a
    `daily_<col>` column holds the row-over-row difference; the first row is
    0 since there is no previous day. Mutates `df` in place and returns it.
    """
    # Vectorized replacement for the original O(n) per-row .loc loop (whose
    # initial row-0 assignments were dead code, immediately overwritten by
    # the final `df.loc[0, ...] = 0` statements). diff() computes the same
    # consecutive differences; fillna(0) reproduces the zeroed first row.
    for col in ('confirmed', 'death', 'recovered', 'active'):
        df[f'daily_{col}'] = df[col].diff().fillna(0)
    return df
def gplotbar(main_df, countryname, cols, startdate='1/1/2020', daily=False, title='No Title'):
    """Plots overlaid bar traces using plotly for one country (or 'all') from the selected Dataframe.

    Parameters
    ----------
    main_df : DataFrame with 'country', 'Date' and cumulative count columns.
    countryname : a country name, or 'all' to aggregate every country.
    cols : list of column stems to plot (e.g. ['confirmed', 'death']).
    startdate : earliest date (inclusive) to include.
    daily : when True, plot daily increments (via add_daily) instead of
        cumulative totals.
    title : figure title.
    """
    # The original duplicated this whole body four times (daily x all);
    # compute the frame once, then branch only on what actually differs.
    if countryname == 'all':
        df = main_df.groupby('Date', as_index=False).sum()
    else:
        df = main_df[main_df['country'] == countryname].groupby('Date', as_index=False).sum()
    if daily:
        # Derive daily_<col> columns before the date filter, matching the
        # original order (so the first filtered day keeps a real increment).
        df = add_daily(df)
    df = df[df.Date >= startdate]
    prefix = 'daily_' if daily else ''
    data = [go.Bar(name=f'{prefix}{c}', x=df['Date'], y=df[f'{prefix}{c}']) for c in cols]
    fig = go.Figure(data=data)
    fig.update_layout(barmode='overlay', title=title)
    fig.show()
def get_country_details(country):
    """Returns the country's ISO alpha-3 code and continent name.

    Tries an exact pycountry lookup first, then fuzzy matching. Names the
    database cannot resolve are mapped through a hand-maintained alias table
    (JHU dataset spellings); territories with no ISO entry (cruise ships,
    Holy See, ...) return the raw name for both fields.
    """
    try:
        country_obj = pycountry.countries.get(name=country)
        if country_obj is None:
            # Exact lookup failed — take the best fuzzy match.
            c = pycountry.countries.search_fuzzy(country)
            country_obj = c[0]
        continent_code = pc.country_alpha2_to_continent_code(country_obj.alpha_2)
        continent = pc.convert_continent_code_to_continent_name(continent_code)
        return country_obj.alpha_3, continent
    # BUG FIX: the original bare `except:` also swallowed KeyboardInterrupt
    # and SystemExit; catch Exception instead.
    except Exception:
        # Normalise dataset-specific spellings to names pycountry knows.
        if 'Congo' in country:
            country = 'Congo'
        elif country in ('Diamond Princess', 'Laos', 'MS Zaandam',
                         'Holy See', 'Timor-Leste'):
            # No usable ISO entry — return the name for both fields.
            return country, country
        elif country in ('Korea, South', 'South Korea'):
            country = 'Korea, Republic of'
        elif country == 'Taiwan*':
            country = 'Taiwan'
        elif country == 'Burma':
            country = 'Myanmar'
        elif country == 'West Bank and Gaza':
            country = 'Gaza'
        else:
            # Unknown name: give up and echo it back for both fields.
            return country, country
        country_obj = pycountry.countries.search_fuzzy(country)
        continent_code = pc.country_alpha2_to_continent_code(country_obj[0].alpha_2)
        continent = pc.convert_continent_code_to_continent_name(continent_code)
        return country_obj[0].alpha_3, continent
def count_cat(n):
    """Returns the categorical bin label for count `n`.

    The first bin is exclusive (n < 25); every later bin includes its upper
    bound, e.g. n == 50 maps to '< 50', matching the original elif chain.
    """
    if n < 25:
        return '< 25'
    # Each redundant `(n >= a) & (n <= b)` test of the original elif chain
    # reduces to `n <= b` once the earlier branches have been rejected, so a
    # simple threshold table is exactly equivalent and far easier to audit.
    bins = (
        (50, '< 50'), (100, '< 100'), (200, '< 200'), (1000, '< 1000'),
        (5000, '< 5000'), (10000, '< 10,000'), (30000, '< 30,000'),
        (100000, '< 100,000'), (150000, '< 150,000'), (200000, '< 200,000'),
        (250000, '< 250,000'), (300000, '< 300,000'), (400000, '< 400,000'),
        (500000, '< 500,000'),
    )
    for upper, label in bins:
        if n <= upper:
            return label
    return '> 500,000'
def convert(pop):
    """Converts a population/density cell to float.

    Cells arrive as strings with thousands separators ('1,234'); missing
    values arrive as NaN floats and map to 0.0.
    """
    # BUG FIX: the original guard `pop == float('nan')` is always False
    # (NaN never compares equal to itself), so NaN cells fell through to
    # `.replace` and raised AttributeError. Test for float/NaN explicitly.
    if isinstance(pop, float):
        return 0.0 if math.isnan(pop) else pop
    return float(pop.replace(',', ''))
#Importing datasets of COVID-19 Confirmed, Death and Recovered counts
# Wide-format time series from the Johns Hopkins CSSE repository:
# one row per country/province, one column per date.
confirmed_cases = pd.read_csv('https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv')
death_cases = pd.read_csv('https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv')
recovered_cases = pd.read_csv('https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_recovered_global.csv')
#Importing datasets of education and educational population
# NOTE(review): absolute Windows paths — these cells only run on the
# author's machine; consider relative paths or a config variable.
ed = pd.read_csv('D:\DATASCIENCE\Project 1\COVID-19\Datasets\\education_illiteracy.csv')
pop = pd.read_csv('D:\DATASCIENCE\Project 1\COVID-19\Datasets\\educational_population.csv')
#Importing datasets of world population and density
worldpop = pd.read_csv('D:\DATASCIENCE\Project 1\COVID-19\Datasets\\world_population.csv')
# Quick visual sanity checks of each loaded frame
# (display is the IPython/Jupyter helper).
display(confirmed_cases.head())
display(confirmed_cases.describe())
display(confirmed_cases.info())
display(worldpop.head())
display(worldpop.describe())
display(worldpop.info())
display(ed.head())
display(ed.describe())
display(ed.info())
#Filtering using Indicators
# Keep only the youth (15-24) illiteracy indicator rows.
ed = ed[ed.Indicator == 'Youth illiterate population, 15-24 years, both sexes (number)']
#Filtering & renaming important columns
ed = ed.drop(['EDULIT_IND','Indicator','LOCATION','Time','Flag Codes','Flags'],axis =1).rename(columns={'Country':'country','TIME':'year','Value':'illiterate'})
#Drop missing values
# Keeping only non-negative values also drops NaN (NaN >= 0 is False).
dropped = ed[ed['illiterate'] >= 0]
#Getting the mean of the available data
max_ed = dropped.groupby(['country']).mean()
#Making sure year is int
max_ed['year'] = max_ed['year'].apply(math.trunc)
#Reseting index
max_ed = max_ed.reset_index()
#Filtering & renaming important columns
pop = pop[(pop.Indicator == 'School age population, upper secondary education, both sexes (number)') | (pop.Indicator =='School age population, tertiary education, both sexes (number)')]
pop = pop.drop(['EDULIT_IND','Indicator','LOCATION','Time','Flag Codes','Flags'],axis =1).rename(columns={'Country':'country','TIME':'year','Value':'pop'})
# Sum upper-secondary + tertiary school-age populations per country/year.
pop = pop.groupby(['country','year'],as_index = False).sum()
#Merging educational data
edu_df = max_ed.merge(pop,on = ['country','year'])
edu_df['illiterate%'] = (edu_df['illiterate'] * 100) / (edu_df['pop'])
# NOTE(review): magic row labels — presumably problematic rows in this
# dataset snapshot; verify these indices after any upstream data change.
edu_df.drop([46,75,123],axis=0,inplace = True)
edu_df = edu_df.reset_index(drop=True)
# Placeholder columns, filled row by row below.
edu_df['ISO'] = 'ISO'
edu_df["continent"] = 'continent'
# Chained assignment (df['col'][i] = ...) works here only because pandas
# warnings are silenced at import time; .loc would be the safe spelling.
for i in range(len(edu_df)):
    if edu_df['country'][i] == 'Sint Maarten':
        # Not resolvable via the pycountry lookup — hard-coded values.
        edu_df['ISO'][i] = 'NLSX'
        edu_df["continent"][i] = 'Europe'
    elif edu_df['country'][i] == 'North Korea':
        edu_df['ISO'][i] = 'PRK'
        edu_df["continent"][i] = 'Asia'
    else:
        # NOTE(review): get_country_details is called twice per row; caching
        # the returned tuple once would halve the fuzzy-lookup cost.
        edu_df['ISO'][i] = get_country_details(edu_df['country'][i])[0]
        edu_df['continent'][i] = get_country_details(edu_df['country'][i])[1]
display(edu_df.head())
display(edu_df.describe())
display(edu_df.info())
#world population and country information dataframe
#Renaming columns in worldpop df to simplify use
worldpop.rename(columns={'Country (or dependent territory)':'country','Population':'pop','Density pop./km2':'density pop/km2'},inplace=True)
#Selecting columns of interest
worldpop = worldpop[['country','pop','density pop/km2']]
#Adding columns to use as reference
# Placeholder columns, filled row by row below.
worldpop["ISO"] = 'ISO'
worldpop["continent"] = 'continent'
# Same row-by-row ISO/continent fill as for edu_df (chained assignment that
# relies on silenced warnings; get_country_details called twice per row).
for i in range(len(worldpop)):
    if worldpop['country'][i] == 'Sint Maarten':
        worldpop['ISO'][i] = 'NLSX'
        worldpop["continent"][i] = 'Europe'
    elif worldpop['country'][i] == 'North Korea':
        worldpop['ISO'][i] = 'PRK'
        worldpop["continent"][i] = 'Asia'
    else:
        worldpop['ISO'][i] = get_country_details(worldpop['country'][i])[0]
        worldpop['continent'][i] = get_country_details(worldpop['country'][i])[1]
# Population/density arrive as strings with thousands separators;
# convert() strips the commas and maps missing values to 0.0.
worldpop['density pop/km2'] = worldpop.apply(lambda x: convert(x['density pop/km2']),axis=1)
worldpop['pop'] = worldpop.apply(lambda x: convert(x['pop']),axis=1)
#Selecting columns of interest
worldpop = worldpop[['ISO','pop','density pop/km2','continent']]
# NOTE(review): magic row labels dropped — presumably duplicate/unmappable
# entries in this snapshot; verify against the source CSV.
worldpop.drop(191,axis = 0,inplace=True)
worldpop.drop(124,axis = 0,inplace=True)
#confirmed cases dataframe cleaning
#dropping columns insted of selecting many columns of interest
confirmed_cases.drop(['Lat','Long','Province/State'],axis = 1,inplace=True)
#Renaming columns in confirmed cases df to simplify use
confirmed_cases.rename(columns={'Country/Region':'country'},inplace=True)
#Creating column ISO for referencing
confirmed_cases['ISO'] ='ISO'
# Row-by-row ISO lookup (chained assignment; relies on silenced warnings).
for i in range(len(confirmed_cases)):
    confirmed_cases['ISO'][i] = get_country_details(confirmed_cases['country'][i])[0]
#transforming df through groupby and melt to reshape date columns
# groupby sums provinces into one row per (country, ISO); melt turns the
# wide per-date columns into long (country, ISO, Date, confirmed) rows.
confirmed_cases = confirmed_cases.groupby(['country','ISO'],as_index=False).sum()
confirmed_cases = confirmed_cases.melt(id_vars=['country','ISO'],var_name='Date',value_name='confirmed')
#creating catagorical column to simplify distribution analysis
confirmed_cases['confirmed_cat'] = 'BASE'
for i in range(len(confirmed_cases)):
    confirmed_cases['confirmed_cat'][i] = count_cat(confirmed_cases['confirmed'][i])
# Ordered categorical so plots sort bins by magnitude, not alphabetically.
confirmed_cases['confirmed_cat'] = pd.Categorical(confirmed_cases['confirmed_cat'],categories=['< 25','< 50','< 100','< 200','< 1000','< 5000','< 10,000','< 30,000','< 100,000','< 150,000','< 200,000','< 250,000','< 300,000','< 400,000','< 500,000','> 500,000'],ordered=True)
#death cases dataframe
#dropping columns insted of selecting many columns of interest
death_cases.drop(['Lat','Long','Province/State'],axis = 1,inplace=True)
#Renaming columns in death cases df to simplify use
death_cases.rename(columns={'Country/Region':'country'},inplace=True)
#transforming df through groupby and melt to reshape date columns
# Sum provinces into one row per country, then melt the wide per-date
# columns into long (country, Date, death) rows.
death_cases = death_cases.groupby('country',as_index=False).sum()
death_cases = death_cases.melt(id_vars='country',var_name='Date',value_name='death')
#creating catagorical column to simplify distribution analysis
death_cases['death_cat'] = 'BASE'
# Row-by-row binning via count_cat (chained assignment, warnings silenced).
for i in range(len(death_cases)):
    death_cases['death_cat'][i] = count_cat(death_cases['death'][i])
# Ordered categorical so distribution plots sort bins by magnitude.
death_cases['death_cat'] = pd.Categorical(death_cases['death_cat'],categories=['< 25','< 50','< 100','< 200','< 1000','< 5000','< 10,000','< 30,000','< 100,000','< 150,000','< 200,000','< 250,000','< 300,000','< 400,000','< 500,000','> 500,000'],ordered=True)
#recovered cases dataframe
#dropping columns insted of selecting many columns of interest
recovered_cases.drop(['Lat','Long','Province/State'],axis = 1,inplace=True)
#Renaming columns in recovered cases df to simplify use
recovered_cases.rename(columns={'Country/Region':'country'},inplace=True)
#transforming df through groupby and melt to reshape date columns
# Sum provinces into one row per country, then melt the wide per-date
# columns into long (country, Date, recovered) rows.
recovered_cases = recovered_cases.groupby('country',as_index=False).sum()
recovered_cases = recovered_cases.melt(id_vars='country',var_name='Date',value_name='recovered')
#creating catagorical column to simplify distribution analysis
recovered_cases['recovered_cat'] = 'BASE'
# Row-by-row binning via count_cat (chained assignment, warnings silenced).
for i in range(len(recovered_cases)):
    recovered_cases['recovered_cat'][i] = count_cat(recovered_cases['recovered'][i])
# Ordered categorical so distribution plots sort bins by magnitude.
recovered_cases['recovered_cat'] = pd.Categorical(recovered_cases['recovered_cat'],categories=['< 25','< 50','< 100','< 200','< 1000','< 5000','< 10,000','< 30,000','< 100,000','< 150,000','< 200,000','< 250,000','< 300,000','< 400,000','< 500,000','> 500,000'],ordered=True)
# Main df (full_df) with combined dfs using merge
#using ISO as base to reference worldpop df
full_df = confirmed_cases
# Default merges join on the shared columns (country/Date, plus ISO where
# both frames carry it).
full_df = full_df.merge(death_cases)
full_df = full_df.merge(recovered_cases)
#merging on ISO
full_df = full_df.merge(worldpop,on = 'ISO')
#converting date column to date object
full_df.Date = pd.to_datetime(full_df.Date,format = '%m/%d/%y')
#creating active cases column and its catagorical column to simplify distribution analysis
#initializing columns
full_df['active'] = 0
full_df['active_cat'] = 'BASE'
#Calculating values
# active = confirmed - (death + recovered); filled row by row via chained
# assignment (relies on warnings being silenced; .loc would be safer).
for i in range(len(full_df)):
    full_df['active'][i] = (full_df['confirmed'][i]) - (full_df['death'][i] + full_df['recovered'][i])
    full_df['active_cat'][i] = count_cat(full_df['active'][i])
#Categorising column
# Ordered categorical so distribution plots sort bins by magnitude.
full_df['active_cat'] = pd.Categorical(full_df['active_cat'],categories=['< 25','< 50','< 100','< 200','< 1000','< 5000','< 10,000','< 30,000','< 100,000','< 150,000','< 200,000','< 250,000','< 300,000','< 400,000','< 500,000','> 500,000'],ordered=True)
#adding mortality percentage, confirmed cases to population percentage and active to confirmed percentage
#initializing columns
# String placeholders; the columns are converted to numeric dtypes below.
full_df['mortality%'] = 'mort'
full_df['confirmed%'] = 'per'
full_df['active%'] = 'perc'
#creating columns
for i in range(len(full_df)):
    full_df['confirmed%'][i] = round(((100 * full_df.confirmed[i]) / full_df['pop'][i]), 4)
    # Guard against division by zero for countries with no confirmed cases.
    if full_df.confirmed[i] == 0:
        full_df['mortality%'][i] = 0
        full_df['active%'][i] = 0
    else:
        full_df['mortality%'][i] = (100 * full_df.death[i]) / full_df.confirmed[i]
        full_df['active%'][i] = full_df['active'][i] * 100 / full_df['confirmed'][i]
#Converting Datatypes of added columns to floats
full_df['mortality%'] = pd.to_numeric(full_df['mortality%'], downcast="float")
full_df['confirmed%'] = pd.to_numeric(full_df['confirmed%'], downcast="float")
full_df['active%'] = pd.to_numeric(full_df['active%'], downcast="float")
display(full_df.head())
display(full_df.describe())
display(full_df.info())
# Pairwise correlations of the numeric columns, shaded for readability.
full_df.corr().style.background_gradient(cmap='Blues')
#Using ecdf to compute the CDF
# Compare the distribution of per-country confirmed counts on the first
# recorded date vs. the latest date.
# NOTE: the list() wrappers are redundant — ecdf already returns a 2-tuple.
x1,y1 = list(ecdf(full_df[(full_df.Date == full_df.Date.min())].confirmed))
x,y = list(ecdf(full_df[(full_df.Date == full_df.Date.max())].confirmed))
#Create a subplot to fit two axis
fig = make_subplots(rows=1, cols=2,subplot_titles=(f'''Cumulative distribution function on {full_df.Date.min().strftime('%m/%d/%Y')}''', f'''Cumulative distribution function on {full_df.Date.max().strftime("%m/%d/%Y")}'''))
#add first plot at the minimum date recorded
fig.add_trace(
    go.Scatter(x= x1,y = y1,name = f'''{full_df.Date.min().strftime('%m/%d/%Y')}'''),
    row=1, col=1
)
#add second plot at the maximum date recorded
fig.add_trace(
    go.Scatter(x = x,y = y,name = f'''{full_df.Date.max().strftime('%m/%d/%Y')}'''),
    row=1, col=2
)
#control title and figure dimentions
fig.update_layout(height=500, width=1000, title_text="Cumulative distribution functions")
fig.show()
Plotting probability mass function for confirmed cases distribution through the four quantiles in the data, to observe the evolution of spread over time.
# Set up the matplotlib figure
f, axes = plt.subplots(2, 2, figsize=(15, 15),sharex=True);
_=sns.despine(left=True);
#distribution of data at 4 quantiles of Dates
# One count plot per date quantile (25%, 50%, 75%, 100%) showing how the
# distribution of binned confirmed counts shifts over time.
for i , j in {0.25:axes[0,0],0.5:axes[0,1],0.75:axes[1,0],1:axes[1,1]}.items():
    d = full_df[full_df.Date == full_df.Date.quantile(i).strftime('%m/%d/%Y')].sort_values(by = 'confirmed_cat');
    # NOTE(review): sns.catplot is figure-level; drawing onto a passed ax=
    # works only on older seaborn releases — confirm the pinned version.
    _=sns.catplot(data = d, x="confirmed_cat", kind="count", palette="ch:.25", ax=j);
    _=j.title.set_text(f'''Distribution of confirmed cases at {full_df.Date.quantile(i).strftime('%m/%d/%Y')} (PMF)''');
    # Discard the extra figure catplot opens so only `f` remains visible.
    plt.close()
#Rotating x labels
# NOTE: the loop variable shadows the `axes` array defined above.
for axes in f.axes:
    plt.sca(axes)
    plt.xticks(rotation=90)
Plotting probability mass function for death cases distribution through the four quantiles in the data, to observe the evolution of spread over time.
# Set up the matplotlib figure
f, axes = plt.subplots(2, 2, figsize=(15, 15),sharex=True);
_=sns.despine(left=True);
#distribution of data at 4 quantiles of Dates
# One count plot per date quantile (25%, 50%, 75%, 100%) showing how the
# distribution of binned death counts shifts over time.
for i , j in {0.25:axes[0,0],0.5:axes[0,1],0.75:axes[1,0],1:axes[1,1]}.items():
    d = full_df[full_df.Date == full_df.Date.quantile(i).strftime('%m/%d/%Y')].sort_values(by = 'death_cat');
    # NOTE(review): sns.catplot is figure-level; drawing onto a passed ax=
    # works only on older seaborn releases — confirm the pinned version.
    _=sns.catplot(data = d, x="death_cat", kind="count", palette="ch:.25", ax=j);
    _=j.title.set_text(f'''Distribution of death cases at {full_df.Date.quantile(i).strftime('%m/%d/%Y')} (PMF)''');
    # Discard the extra figure catplot opens so only `f` remains visible.
    plt.close()
#Rotating x labels
# NOTE: the loop variable shadows the `axes` array defined above.
for axes in f.axes:
    plt.sca(axes)
    plt.xticks(rotation=90)
Plotting probability mass function for recovered cases distribution through the four quantiles in the data, to observe the evolution of spread over time.
# Set up the matplotlib figure
f, axes = plt.subplots(2, 2, figsize=(15, 15),sharex=True);
_=sns.despine(left=True);
#distribution of data at 4 quantiles of Dates
# One count plot per date quantile (25%, 50%, 75%, 100%) showing how the
# distribution of binned recovered counts shifts over time.
for i , j in {0.25:axes[0,0],0.5:axes[0,1],0.75:axes[1,0],1:axes[1,1]}.items():
    d = full_df[full_df.Date == full_df.Date.quantile(i).strftime('%m/%d/%Y')].sort_values(by = 'recovered_cat');
    # NOTE(review): sns.catplot is figure-level; drawing onto a passed ax=
    # works only on older seaborn releases — confirm the pinned version.
    _=sns.catplot(data = d, x="recovered_cat", kind="count", palette="ch:.25", ax=j);
    _=j.title.set_text(f'''Distribution of recovered cases at {full_df.Date.quantile(i).strftime('%m/%d/%Y')} (PMF)''');
    # Discard the extra figure catplot opens so only `f` remains visible.
    plt.close()
#Rotating x labels
# NOTE: the loop variable shadows the `axes` array defined above.
for axes in f.axes:
    plt.sca(axes)
    plt.xticks(rotation=90)
Plotting probability mass function for active cases distribution through the four quantiles in the data, to observe the evolution of spread over time.
# Set up the matplotlib figure
f, axes = plt.subplots(2, 2, figsize=(15, 15),sharex=True);
_=sns.despine(left=True);
#distribution of data at 4 quantiles of Dates
# One count plot per date quantile (25%, 50%, 75%, 100%) showing how the
# distribution of binned active counts shifts over time.
for i , j in {0.25:axes[0,0],0.5:axes[0,1],0.75:axes[1,0],1:axes[1,1]}.items():
    d = full_df[full_df.Date == full_df.Date.quantile(i).strftime('%m/%d/%Y')].sort_values(by = 'active_cat');
    # NOTE(review): sns.catplot is figure-level; drawing onto a passed ax=
    # works only on older seaborn releases — confirm the pinned version.
    _=sns.catplot(data = d, x="active_cat", kind="count", palette="ch:.25", ax=j);
    _=j.title.set_text(f'''Distribution of active cases at {full_df.Date.quantile(i).strftime('%m/%d/%Y')} (PMF)''');
    # Discard the extra figure catplot opens so only `f` remains visible.
    plt.close()
#Rotating x labels
# NOTE: the loop variable shadows the `axes` array defined above.
for axes in f.axes:
    plt.sca(axes)
    plt.xticks(rotation=90)
# Worldwide totals on the latest date, plus the overall mortality rate.
global_latest_count = full_df[full_df.Date == full_df.Date.max()].groupby('Date').sum()[['confirmed','death','recovered','active']]
global_latest_count['mortality%'] = global_latest_count.death * 100 / global_latest_count.confirmed
display(global_latest_count)
# Same aggregation per continent, shown only where confirmed > 1000.
global_latest_count = full_df[full_df.Date == full_df.Date.max()].groupby('continent').sum()[['confirmed','death','recovered','active']]
global_latest_count['mortality%'] = global_latest_count.death * 100 / global_latest_count.confirmed
display(global_latest_count[global_latest_count.confirmed>1000].style.background_gradient(cmap='Blues',subset=["confirmed"])\
.background_gradient(cmap='Reds',subset=["death"])\
.background_gradient(cmap='Greens',subset=["recovered"])\
.background_gradient(cmap='YlOrBr',subset=["mortality%"])\
.background_gradient(cmap='Purples',subset=["active"])
)
# Per-country table sorted by confirmed count, colour-graded per column.
global_latest_count = full_df[full_df.Date == full_df.Date.max()].groupby('country').sum()[['confirmed','death','recovered','active']]
global_latest_count['mortality%'] = global_latest_count.death * 100 / global_latest_count.confirmed
display(global_latest_count.sort_values(by = 'confirmed',ascending = False).style.background_gradient(cmap='Blues',subset=["confirmed"])\
.background_gradient(cmap='Reds',subset=["death"])\
.background_gradient(cmap='Greens',subset=["recovered"])\
.background_gradient(cmap='YlOrBr',subset=["mortality%"])\
.background_gradient(cmap='Purples',subset=["active"])
)
Since cases and deaths have grown exponentially over the past three months throughout the world, I have plotted the choropleth map on a logarithmic scale. You can hover over a country to see its total confirmed cases or deaths.
#subsetting from full_df for mapping with log scale
# Aggregate to one row per country/date; mortality% and confirmed% are
# included as group keys so they survive the sum().
world_df = full_df.groupby(['country','Date','ISO','mortality%','confirmed%'],as_index=False).sum()
# Plotly animation frames need plain strings, not Timestamps.
world_df['Date'] = world_df.Date.apply(lambda x: x.date()).apply(str)
# Counts grow exponentially, so colour on log(x + 1); the +1 keeps
# zero-count countries finite (log(0) would be -inf).
world_df['ln_confirmed'] = np.log(world_df.confirmed + 1)
world_df['ln_death'] = np.log(world_df.death + 1)
world_df['ln_recovered'] = np.log(world_df.recovered + 1)
world_df['ln_mortality%'] = np.log(world_df['mortality%'] + 1)
world_df['ln_active'] = np.log(world_df['active'] + 1)
px.choropleth(world_df,
              locations="ISO",
              color="ln_confirmed",
              hover_name="country",
              hover_data=["death"],
              animation_frame="Date",
              color_continuous_scale='Purples', title='Confirmed Cases Worldwide (log scale)')
px.choropleth(world_df,
              locations="ISO",
              color="ln_death",
              hover_name="country",
              hover_data=["recovered"],
              animation_frame="Date",
              color_continuous_scale='Reds', title='Death Cases Worldwide (log scale)')
px.choropleth(world_df,
              locations="ISO",
              color="ln_recovered",
              hover_name="country",
              hover_data=["death"],
              animation_frame="Date",
              color_continuous_scale='Greens', title='Recovered Cases Worldwide (log scale)')
# BUG FIX: this map's title says "(log scale)" but it previously coloured on
# the raw 'active' column even though 'ln_active' was computed above for
# exactly this purpose; use the log column so the scale matches the title.
px.choropleth(world_df,
              locations="ISO",
              color='ln_active',
              hover_name="country",
              hover_data=["death"],
              animation_frame="Date",
              color_continuous_scale='amp', title='Active Cases Worldwide (log scale)')
# Animated scatter-geo of confirmed cases; marker size grows sub-linearly
# (power 0.3) so huge outbreaks do not swamp the map.
# List selection here replaces the original tuple-style groupby selection,
# which is deprecated and removed in pandas >= 2.0.
df_data = full_df.groupby(['country','Date'],as_index = False)[['confirmed','death']].max()
df_data["Date"] = pd.to_datetime( df_data["Date"]).dt.strftime('%m/%d/%Y')
fig = px.scatter_geo(df_data, locations="country", locationmode='country names',
                     color=df_data["confirmed"],
                     size=np.power(df_data["confirmed"]+1,0.3)-1,
                     hover_name="country",
                     hover_data=["confirmed"],
                     range_color=[0, max(df_data["confirmed"])-1],
                     animation_frame="Date",
                     color_continuous_scale=px.colors.sequential.Plasma,
                     title='Virus Spread through time (confirmed cases)'
                     )
fig.update_coloraxes(colorscale="hot")
fig.update(layout_coloraxis_showscale=False)
fig.show()
Finding the top 10 countries affected. Since the confirmed cases and deaths are cumulative sums to date, adding daily counts is recommended.
Starting off with worldwide data analysis using bar plots to get a general sense of how data growth seems to behave.
# Worldwide cumulative and daily bar charts of all four case types.
gplotbar(full_df,cols=['confirmed','active','recovered','death'],title='Worldwide total Cases, Recoveries and Deaths counts',countryname='all')
gplotbar(full_df,daily=True,countryname='all',cols=['confirmed','active','recovered','death'],title='Worldwide Daily Cases, Recoveries and Deaths counts')
# Melt so every case type becomes a row; summing per (Date, Case Type)
# yields one line per case type in the plot.
df_temp = full_df.melt(id_vars = ['country','Date','ISO','confirmed%','mortality%','pop','confirmed_cat','death_cat','recovered_cat','density pop/km2','continent','active_cat','active%'],var_name = 'Case Type',value_name='count').groupby(['Date','Case Type'],as_index=False).sum()[['Date','Case Type','count']]
px.line(df_temp,x = "Date", y = 'count',color = 'Case Type',title='Worldwide Cases, Recoveries and Deaths counts')
# Same view, but on daily increments derived via add_daily.
df_temp = add_daily(full_df.groupby('Date',as_index=False).sum()[['Date','confirmed','death','recovered','active']])[['Date','daily_confirmed','daily_death','daily_recovered','daily_active']]
df_temp = df_temp.melt(id_vars = ['Date'],var_name = 'Case Type',value_name='count').groupby(['Date','Case Type'],as_index=False).sum()[['Date','Case Type','count']]
px.line(df_temp,x = "Date", y = 'count',color = 'Case Type',title='Worldwide Daily Cases, Recoveries and Deaths counts')
Figuring out the top countries by confirmed, death, recovered, and active cases, and deducing trends and conclusions from them.
TIP : Click on US in the legend of the graph to have a more clear view of the data.
# For each metric: pick the 10 largest countries on the latest date, then
# plot their full time series (pxplotline restricts to sub_df's countries).
sub_df = full_df[full_df.Date == full_df.Date.max()].nlargest(10,'confirmed')
pxplotline(full_df,sub_df,'confirmed',x ='Date',title='Total # Cases for top 10 affected countries')
# sub_df.country.apply(get_country_details)
sub_df = full_df[full_df.Date == full_df.Date.max()].nlargest(10,'death')
pxplotline(full_df,sub_df,'death',x ='Date',title='Total # Deaths for top 10 affected countries')
#sub_df.country.apply(get_country_details)
sub_df = full_df[full_df.Date == full_df.Date.max()].nlargest(10,'recovered')
pxplotline(full_df,sub_df,'recovered',x ='Date',title='Total # Recovered for top 10 affected countries')
#sub_df.country.apply(get_country_details)
sub_df = full_df[full_df.Date == full_df.Date.max()].nlargest(10,'active')
pxplotline(full_df,sub_df,'active',x ='Date',title='Total # Active for top 10 affected countries')
#sub_df.country.apply(get_country_details)
Finding the most active and least active among the top ten most affected countries reflects how the virus envelops one country after another and how some countries seem to have overcome the challenge.
# Rank the 10 most-affected countries by their share of still-active cases
# (a low active% suggests the outbreak is subsiding there).
least_active = full_df[full_df.Date == full_df.Date.max()].sort_values('confirmed',ascending = False)[['country','confirmed','death','recovered','active','active%']]
least_active = least_active.nlargest(10,'confirmed')
least_active = least_active.sort_values(by = 'active%')
display(least_active.style.background_gradient(cmap='Blues',subset=["confirmed"])\
.background_gradient(cmap='Reds',subset=["death"])\
.background_gradient(cmap='Greens',subset=["recovered"])\
.background_gradient(cmap='YlOrBr',subset=["active"])\
.background_gradient(cmap='Purples',subset=["active%"]))
Starting analysis with US as it is the top country with confirmed cases, following up with United Kingdom as it has the highest percentage of active cases. Finalizing this part of the analysis with contrary countries. Starting with China, having the least percentage of active cases, following with Germany, having the most recovered cases.
# --- US: top country by confirmed cases ---
gplotbar(full_df,'US',cols=['confirmed','active','recovered','death'],daily=False,title = 'US Cases, Deaths, Recovered and Active cases on from 3/1/2020',startdate='3/1/2020')
#Creating a reshaped df with Case Type as one column
# NOTE(review): country_all is recomputed identically for every country in
# this section — it could be built once and reused.
country_all = full_df.melt(id_vars = ['country','Date','ISO','confirmed%','mortality%','pop','confirmed_cat','death_cat','recovered_cat','density pop/km2','continent','active_cat','active%'],var_name = 'Case Type')
countryname= 'US'
fig = px.line(country_all[(country_all['country'] == countryname)],x = 'Date', y = 'value',color = 'Case Type',title=f'Progression of Case types for {countryname} through time')
fig.show()
gplotbar(full_df,daily=True,countryname='US',cols=['confirmed','active','recovered','death'],title='US Daily Cases, Recoveries and Deaths counts on Daily Basis from 3/1/2020',startdate='3/1/2020')
# --- United Kingdom: highest active% among the top ten ---
gplotbar(full_df,'United Kingdom',cols=['confirmed','active','death','recovered'],daily=False,title = 'UK Cases, Deaths, Recovered and Active cases from 3/1/2020',startdate='3/1/2020')
#Creating a reshaped df with Case Type as one column
country_all = full_df.melt(id_vars = ['country','Date','ISO','confirmed%','mortality%','pop','confirmed_cat','death_cat','recovered_cat','density pop/km2','continent','active_cat','active%'],var_name = 'Case Type')
countryname= 'United Kingdom'
fig = px.line(country_all[(country_all['country'] == countryname)],x = 'Date', y = 'value',color = 'Case Type',title=f'Progression of Case types for {countryname} through time')
fig.show()
gplotbar(full_df,daily=True,countryname='United Kingdom',cols=['confirmed','active','death','recovered'],title='UK Cases, Deaths, Recovered and Active cases on Daily Basis from 3/1/2020',startdate='3/1/2020')
# --- China: lowest active% (default startdate covers the full history) ---
gplotbar(full_df,daily=False,countryname='China',cols=['confirmed','active','recovered','death'],title='China Cases, Deaths, Recovered and Active cases from 1/22/2020')
#Creating a reshaped df with Case Type as one column
country_all = full_df.melt(id_vars = ['country','Date','ISO','confirmed%','mortality%','pop','confirmed_cat','death_cat','recovered_cat','density pop/km2','continent','active_cat','active%'],var_name = 'Case Type')
countryname= 'China'
fig = px.line(country_all[(country_all['country'] == countryname)],x = 'Date', y = 'value',color = 'Case Type',title=f'Total Case types for {countryname}')
fig.show()
gplotbar(full_df,daily=True,countryname='China',cols=['confirmed','active','recovered','death'],title='China Cases, Deaths, Recovered and Active cases on Daily Basis from 1/22/2020')
# --- Germany: most recovered cases ---
gplotbar(full_df,daily=False,countryname='Germany',cols=['confirmed','active','recovered','death'],title='Germany Cases, Deaths, Recovered and Active cases from 3/1/2020',startdate='3/1/2020')
#Creating a reshaped df with Case Type as one column
country_all = full_df.melt(id_vars = ['country','Date','ISO','confirmed%','mortality%','pop','confirmed_cat','death_cat','recovered_cat','density pop/km2','continent','active_cat','active%'],var_name = 'Case Type')
countryname= 'Germany'
fig = px.line(country_all[(country_all['country'] == countryname)],x = 'Date', y = 'value',color = 'Case Type',title=f'Progression of Case types for {countryname} through time')
fig.show()
gplotbar(full_df,daily=True,countryname='Germany',cols=['confirmed','active','recovered','death'],title='Germany Cases, Deaths, Recovered and Active cases on Daily Basis from 3/1/2020',startdate='3/1/2020')
The mortality rate is a critical indicator in such crises: it expresses deaths as a percentage of confirmed cases and describes how severely the pandemic affects individual countries. In this section, the mortality rate is explored to find trends and patterns, as well as to analyse which countries are in the most critical condition.
Sorting countries by mortality rates calculated from confirmed cases and deaths.
# All countries with more than 1000 confirmed cases on the latest date,
# ranked by mortality rate (deaths / confirmed, precomputed as 'mortality%').
_cols = ['country','confirmed','death','recovered','density pop/km2','mortality%','confirmed%']
df_temp = full_df[(full_df.Date == full_df.Date.max()) & (full_df.confirmed > 1000)][_cols]
df_temp = df_temp.sort_values(by='mortality%', ascending=False)

# Color each metric column with its own gradient for quick visual scanning.
_styled = df_temp.style
for _cmap, _col in [('Blues', 'confirmed'),
                    ('Reds', 'death'),
                    ('Greens', 'recovered'),
                    ('Purples', 'density pop/km2'),
                    ('YlOrBr', 'mortality%'),
                    ('bone_r', 'confirmed%')]:
    _styled = _styled.background_gradient(cmap=_cmap, subset=[_col])
_styled
# One row per (Date, country); the groupby is effectively a de-duplication.
# FIX: multi-column selection on a groupby must use a list — the original
# tuple form groupby(...)['confirmed', 'death', ...] was deprecated and
# removed in pandas 2.0.
df_data = full_df.groupby(['Date', 'country'])[['confirmed', 'death', 'continent', 'mortality%']].max().reset_index()
df_data["date_reformated"] = pd.to_datetime(df_data["Date"]).dt.strftime('%m/%d/%Y')

# Animated bubble chart: mortality rate vs confirmed cases (log x), one frame per day.
fig = px.scatter(df_data, y='mortality%',
                 x=df_data["confirmed"],
                 range_y=[-1, 18],
                 range_x=[1, df_data["confirmed"].max() + 1000000],
                 color="continent",
                 hover_name="country",
                 hover_data=["confirmed", "death"],
                 range_color=[0, max(np.power(df_data["confirmed"], 0.3))],
                 animation_frame="date_reformated",
                 animation_group="country",
                 color_continuous_scale=px.colors.sequential.Plasma,
                 title='Change in Mortality Rate of Each Countries Over Time',
                 # bubble area grows sub-linearly with case count so small countries stay visible
                 size=np.power(df_data["confirmed"] + 1, 0.3) - 0.5,
                 size_max=30,
                 log_x=True,
                 height=700,
                 )
fig.update_coloraxes(colorscale="hot")
fig.update(layout_coloraxis_showscale=False)
fig.update_xaxes(title_text="Confirmed Cases (Log Scale)")
fig.update_yaxes(title_text="Mortality Rate (%)")
fig.show()
Exploring mortality rates between continents and their ordinary least squares to find insights on how they interact with each other through time.
# Same animated bubble chart restricted to Europe vs Africa, with OLS trendlines.
# FIX: multi-column groupby selection uses a list — the tuple form was removed
# in pandas 2.0.
df_data = full_df.groupby(['Date', 'country'])[['confirmed', 'death', 'continent', 'mortality%']].max().reset_index()
df_data["date_reformated"] = pd.to_datetime(df_data["Date"]).dt.strftime('%m/%d/%Y')
df_data = df_data[(df_data.continent == 'Europe') | (df_data.continent == 'Africa')]
fig = px.scatter(df_data, trendline='ols', y='mortality%',
                 x=df_data["confirmed"],
                 range_y=[-1, 18],
                 range_x=[1, df_data["confirmed"].max() + 1000000],
                 color="continent",
                 hover_name="country",
                 hover_data=["confirmed", "death"],
                 range_color=[0, max(np.power(df_data["confirmed"], 0.3))],
                 animation_frame="date_reformated",
                 animation_group="country",
                 color_continuous_scale=px.colors.sequential.Plasma,
                 title='Change in Mortality Rate of Europe and Africa Over Time',
                 size=np.power(df_data["confirmed"] + 1, 0.3) - 0.5,
                 size_max=30,
                 log_x=True,
                 height=700,
                 )
fig.update_coloraxes(colorscale="hot")
fig.update(layout_coloraxis_showscale=False)
fig.update_xaxes(title_text="Confirmed Cases (Log Scale)")
fig.update_yaxes(title_text="Mortality Rate (%)")
fig.show()
# Same animated bubble chart restricted to Europe vs Asia, with OLS trendlines.
# FIX: multi-column groupby selection uses a list — the tuple form was removed
# in pandas 2.0.
df_data = full_df.groupby(['Date', 'country'])[['confirmed', 'death', 'continent', 'mortality%']].max().reset_index()
df_data["date_reformated"] = pd.to_datetime(df_data["Date"]).dt.strftime('%m/%d/%Y')
df_data = df_data[(df_data.continent == 'Europe') | (df_data.continent == 'Asia')]
fig = px.scatter(df_data, trendline='ols', y='mortality%',
                 x=df_data["confirmed"],
                 range_y=[-1, 18],
                 range_x=[1, df_data["confirmed"].max() + 1000000],
                 color="continent",
                 hover_name="country",
                 hover_data=["confirmed", "death"],
                 range_color=[0, max(np.power(df_data["confirmed"], 0.3))],
                 animation_frame="date_reformated",
                 animation_group="country",
                 color_continuous_scale=px.colors.sequential.Plasma,
                 title='Change in Mortality Rate of Europe and Asia Over Time',
                 size=np.power(df_data["confirmed"] + 1, 0.3) - 0.5,
                 size_max=30,
                 log_x=True,
                 height=700,
                 )
fig.update_coloraxes(colorscale="hot")
fig.update(layout_coloraxis_showscale=False)
fig.update_xaxes(title_text="Confirmed Cases (Log Scale)")
fig.update_yaxes(title_text="Mortality Rate (%)")
fig.show()
Investigating high mortality rates in countries with high case counts to identify the countries most critically affected by the current pandemic.
# Top 10 mortality-rate countries (min 2500 confirmed) on the latest date.
# nlargest already returns rows ordered by mortality% descending, so the
# original sort_values(...) before it was redundant work and has been dropped.
df_temp = full_df[(full_df.confirmed > 2500) & (full_df.Date == full_df.Date.max())][['country','confirmed','death','recovered','density pop/km2','mortality%','confirmed%']].nlargest(10,'mortality%')
df_temp.style.background_gradient(cmap='Blues',subset=["confirmed"])\
.background_gradient(cmap='Reds',subset=["death"])\
.background_gradient(cmap='Greens',subset=["recovered"])\
.background_gradient(cmap='Purples',subset=["density pop/km2"])\
.background_gradient(cmap='YlOrBr',subset=["mortality%"])\
.background_gradient(cmap='bone_r',subset=["confirmed%"])
# Snapshot at the latest date (Date.quantile(1) is the max date;
# NOTE(review): full_df.Date.max() would be clearer — confirm Date is
# datetime-typed here) for countries above 2500 confirmed cases.
snapshot = full_df[(full_df.Date >= full_df.Date.quantile(1).strftime('%m/%d/%Y')) & (full_df.confirmed > 2500)][['Date','country','continent','confirmed','death','recovered','density pop/km2','mortality%','confirmed%']].sort_values(by='mortality%', ascending=False)

# The ten countries with the highest mortality rate in that snapshot.
top_countries = snapshot.nlargest(10, 'mortality%')['country']

# Right-merge keeps the full time series for exactly those ten countries.
df_temp = full_df.merge(top_countries, how='right', on='country')
df_temp['Date'] = pd.to_datetime(df_temp['Date']).dt.strftime('%m/%d/%Y')

# Animated bubble chart of mortality rate vs confirmed cases, one frame per day.
fig = px.scatter(df_temp, trendline='ols', y='mortality%',
                 x=df_temp["confirmed"],
                 color="continent",
                 hover_name="country",
                 hover_data=["confirmed", "death"],
                 range_y=[-1, 18],
                 range_x=[-1000, df_temp["confirmed"].max() + 10000],
                 range_color=[0, max(np.power(df_temp["confirmed"], 0.3))],
                 animation_frame=df_temp["Date"],
                 animation_group=df_temp["country"],
                 color_continuous_scale=px.colors.sequential.Plasma,
                 title='Change in Mortality Rate of Highest Mortality Countries Over Time',
                 size=np.power(df_temp["confirmed"] + 1, 0.3) - 0.5,
                 size_max=30,
                 log_x=False,
                 height=700
                 )
fig.update_coloraxes(colorscale="hot")
fig.update(layout_coloraxis_showscale=False)
fig.update_xaxes(title_text="Confirmed Cases")
fig.update_yaxes(title_text="Mortality Rate (%)")
fig.show()
# Time series of mortality rate for the 10 highest-mortality countries
# (>2500 confirmed on the latest date).  nlargest already sorts by
# mortality% descending, so the original ascending pre-sort was redundant.
df_temp1 = full_df[(full_df.Date == full_df.Date.max()) & (full_df.confirmed > 2500)].nlargest(10,'mortality%')
# Inner merge on country keeps every historical row of those countries;
# overlapping columns get _x (full history) / _y (latest snapshot) suffixes.
df_temp = full_df.merge(df_temp1,on = 'country')
fig = px.line(df_temp,x = 'Date_x', y = 'mortality%_x',color = 'country',title='Rate of mortality increase in highest mortality rate countries')
fig.update_layout(xaxis_title='Date',
                  yaxis_title="Mortality Rate (%)")
fig.show()
# Top 10 countries by confirmed-cases-to-population ratio (confirmed%) among
# those with >200 confirmed on the latest date.  The original sort_values
# before nlargest was redundant (nlargest orders descending already).
df_temp = full_df[(full_df.Date == full_df.Date.max()) & (full_df.confirmed > 200)][['country','continent','confirmed','death','recovered','density pop/km2','mortality%','pop','confirmed%']].nlargest(10,'confirmed%')
df_temp.style.background_gradient(cmap='Blues',subset=["confirmed"])\
.background_gradient(cmap='Reds',subset=["death"])\
.background_gradient(cmap='Greens',subset=["recovered"])\
.background_gradient(cmap='Purples',subset=["density pop/km2"])\
.background_gradient(cmap='YlOrBr',subset=["mortality%"])\
.background_gradient(cmap='bone_r',subset=["confirmed%"])\
.background_gradient(cmap='Blues',subset=["pop"])
# Ten countries with the highest confirmed-cases-to-population ratio on the
# latest available date.
latest_mask = full_df.Date == full_df.Date.max()
sub_df = full_df.loc[latest_mask].nlargest(10, 'confirmed%')
sub_df
# Plot their confirmed% over time (pxplotline is defined earlier in this file).
pxplotline(full_df, sub_df, 'confirmed%', x='Date', title='Countries with highest confirmed cases to total population ratio', hd=['pop', 'death', 'confirmed'])
# 20 densest countries (capped at 3500 pop/km2 to exclude city-states) with
# >2500 confirmed on the latest date.  The original sort_values before
# nlargest was redundant (nlargest orders descending already).
df_temp = full_df[(full_df.Date == full_df.Date.max()) & (full_df.confirmed > 2500) & (full_df['density pop/km2'] < 3500)][['country','continent','confirmed','death','recovered','density pop/km2','mortality%','pop','confirmed%']].nlargest(20,'density pop/km2')
df_temp.style.background_gradient(cmap='Blues',subset=["confirmed"])\
.background_gradient(cmap='Reds',subset=["death"])\
.background_gradient(cmap='Greens',subset=["recovered"])\
.background_gradient(cmap='Purples',subset=["density pop/km2"])\
.background_gradient(cmap='YlOrBr',subset=["mortality%"])\
.background_gradient(cmap='bone_r',subset=["confirmed%"])\
.background_gradient(cmap='Blues',subset=["pop"])
Exploring confirmed and death cases between continents and their countries, and their ordinary least squares to find insights on how they interact with each other through time.
# Latest-date snapshot, with IQR-based fences to trim extreme confirmed-case
# outliers before regressing cases/deaths on population density.
snapshot = full_df[full_df['Date'] == full_df.Date.max()]
q1, q3 = np.percentile(snapshot.confirmed, [25, 75])
iqr = q3 - q1
# NOTE(review): the lower fence is -q1 + 1.2*IQR rather than the conventional
# q1 - 1.5*IQR — looks like a deliberate tuned cut-off, but worth confirming.
low = -q1 + 1.2 * iqr
high = q3 + 2.5 * iqr
snapshot = snapshot[(snapshot['confirmed'] > low) & (snapshot['confirmed'] < high)]
snapshot = snapshot[(snapshot['density pop/km2'] < 3500) & (snapshot['confirmed'] > 2500)]
px.scatter(snapshot, trendline='ols', y='confirmed', x='density pop/km2', size='pop', color='continent', hover_data=['country'], title='Variation of Population density wrt Confirmed Cases')
px.scatter(snapshot, trendline='ols', y='death', x='density pop/km2', size='pop', color='continent', hover_data=['country'], title='Variation of Population density wrt death')
# Join illiteracy data onto the COVID data by ISO code and tabulate the 20
# most illiterate countries on the latest date.
edu_full_df = full_df.merge(edu_df[['ISO','illiterate%']],on = 'ISO')
edu_full_df_temp = edu_full_df[edu_full_df.Date == edu_full_df.Date.max()].nlargest(20,'illiterate%')
# BUG FIX: the original chain ended with a stray trailing backslash, which
# continued the statement into the following line and broke the script.
edu_full_df_temp[['country','continent','confirmed','death','recovered','active','illiterate%']].style.background_gradient(cmap='Blues',subset=["confirmed"])\
.background_gradient(cmap='Reds',subset=["death"])\
.background_gradient(cmap='Greens',subset=["recovered"])\
.background_gradient(cmap='Purples',subset=["active"])\
.background_gradient(cmap='YlOrBr',subset=["illiterate%"])
# Bar chart of every country's illiteracy rate, highest first.
ordered = edu_df.sort_values('illiterate%', ascending=False)
fig, ax = plt.subplots(figsize=(70, 30))
sns.barplot(data=ordered, x='country', y='illiterate%')
# Rotate the country labels so they remain legible on a wide axis.
_ = ax.set_xticklabels(labels=ordered.country, rotation=90)
plt.title('Percentage of illiteracy sorted');
# Scatter + OLS trendline: confirmed cases vs illiteracy rate for the 20 most
# illiterate countries on the latest date.
edu_full_df = full_df.merge(edu_df[['ISO', 'illiterate%']], on='ISO')
latest = edu_full_df[edu_full_df.Date == edu_full_df.Date.max()]
edu_full_df_temp = latest.nlargest(20, 'illiterate%')
px.scatter(edu_full_df_temp, trendline="ols", x='confirmed', y='illiterate%', size='pop', color='continent', hover_data=['country', 'continent'], title='Variation of illiteracy rate wrt confirmed cases for top 20 illiterate countries')
# Scatter + OLS trendline: deaths vs illiteracy rate for the 20 most
# illiterate countries on the latest date.
edu_full_df = full_df.merge(edu_df[['ISO', 'illiterate%']], on='ISO')
latest = edu_full_df[edu_full_df.Date == edu_full_df.Date.max()]
edu_full_df_temp = latest.nlargest(20, 'illiterate%')
px.scatter(edu_full_df_temp, trendline="ols", y='death', x='illiterate%', size='pop', color='continent', hover_data=['country'], title='Variation of illiteracy rate wrt death cases for top 20 illiterate countries')
After a lengthy analysis of the above data, and several conclusions drawn from the various sections of this project, we can safely conclude the following:
And Finally Stay Home and Stay Safe.